This is a worked solution in Julia to the second part of Exercise 5 from Andrew Ng's Machine Learning course on Stanford's OpenClassroom. $ \newcommand{\cond}{{\mkern+2mu} \vert {\mkern+2mu}} \newcommand{\SetDiff}{\mathrel{\backslash}} \DeclareMathOperator{\BetaFunc}{Β} \DeclareMathOperator{\GammaFunc}{Γ} \DeclareMathOperator{\prob}{p} \DeclareMathOperator{\cost}{J} \DeclareMathOperator{\score}{V} \DeclareMathOperator{\gradient}{\nabla} \DeclareMathOperator{\Hessian}{H} \DeclareMathOperator{\dcategorical}{Categorical} \DeclareMathOperator{\ddirichlet}{Dirichlet} \DeclareMathOperator{\E}{e} $
Set up the Plots.jl package and define helper functions for a scatter plot of the data and a probability-contour overlay.
In [1]:
using Distributions
using Plots
gadfly()   # select the Gadfly backend for Plots

# Scatter plot of the two classes in the (U, V) plane.
function base_plot(U, V, Y)
    scatter(
        U, V,
        xlabel = "U", xlims = (-1, 1.5),
        ylabel = "V", ylims = (-1, 1.5),
        group = map((y) -> ["y = 0", "y = 1"][y+1], Y),
        markercolor = [colorant"LightSkyBlue" colorant"Red"],
        markersize = 4,
        size = (550, 500) )
end

# Overlay contours of the class probability p(y = 1 | x; θ) on the scatter plot.
function contour_plot(U, V, Y, θ, λ)
    plt = base_plot(U, V, Y)
    contour!(
        plt,
        linspace(-1, 1.5, 200),
        linspace(-1, 1.5, 200),
        (x, y) -> h(map_feature(x, y), θ),
        title = "λ = $λ" )
end;
The data for this exercise is downloaded and unpacked from ex5Data.zip.
In [2]:
# Class labels y ∈ {0, 1}.
Y = convert(AbstractVector{Int}, vec(readdlm("ex5Logy.dat")))
N = length(Y)

# The two raw input features.
UV = readdlm("ex5Logx.dat", ',', Float64);
U = UV[:,1]
V = UV[:,2];
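A quick look at the sample size and class balance (a small check added here, not part of the original exercise):
println("$N samples: ", sum(Y .== 0), " with y = 0 and ", sum(Y .== 1), " with y = 1")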
In [3]:
base_plot(U, V, Y)
Out[3]:
The predictors in this exercise are all monomials of $U$ and $V$ up to combined degree six, $x = (1, u, v, u^2, uv, v^2, \dotsc, v^6)^{\text{T}}$, giving 28 features in total.
In [4]:
# Map a single point (feat1, feat2) to its monomial feature vector.
function map_feature(feat1::Number, feat2::Number)
    return vec(map_feature([feat1], [feat2]))
end

# Map vectors of raw features to the full design matrix of monomials
# up to combined degree 6, including the intercept column of ones.
function map_feature(feat1::AbstractVector, feat2::AbstractVector)
    degree = 6
    out = ones(size(feat1))
    for i = 1:degree
        for j = 0:i
            out = [out (feat1.^(i-j)) .* (feat2.^j)]
        end
    end
    return out
end;
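As a quick illustration (the point (0.5, -0.5) is an arbitrary choice), a single point expands into a 28-element feature vector:
x_example = map_feature(0.5, -0.5)
length(x_example)    # 28 = 1 intercept + 27 monomials of degrees 1 to 6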
In [5]:
X = map_feature(U, V);
P = size(X, 2)
Out[5]:
The sigmoid function, $g(α) = \dfrac{1}{1 + \E^{-α}}$.
In [6]:
# The logistic (sigmoid) function, applied elementwise to vector arguments.
function sigmoid(α)
    1 ./ (1 .+ exp(-α))
end;
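A brief sanity check (illustrative values only): the sigmoid maps 0 to 0.5 and saturates towards 0 and 1 for large negative and positive arguments.
sigmoid([-5.0, 0.0, 5.0])    # ≈ [0.0067, 0.5, 0.9933]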
The hypothesis function, $h_θ(x) = g(θ^{\text{T}}x) = \prob(y = 1 \cond x; θ)$.
In [7]:
# Hypothesis for a single example xₙ.
function h(xₙ::AbstractVector, θ::AbstractVector)
    sigmoid(dot(xₙ, θ))
end

# Hypothesis for all examples at once, given the design matrix X.
function h(X::AbstractMatrix, θ::AbstractVector)
    sigmoid(X * θ)
end;
The regularised cost function, $\cost(θ)$, is defined as $$ \cost(θ) = \frac{1}{N} \sum_{n=1}^N \big[ -y_n \log(h_θ(x_n)) - (1 - y_n) \log(1 - h_θ(x_n)) \big] + \frac{λ}{2N} \sum_{j=1}^{P-1} θ_j^2, $$ where the intercept parameter $θ_0$ is excluded from the regularisation.
In [8]:
function cost(X::AbstractMatrix, Y::AbstractVector, θ::AbstractVector, λ::Number)
    N = length(Y)
    hₓ = h(X, θ)
    # Negative log-likelihood averaged over the N examples.
    J = sum(-Y .* log(hₓ) - (1 .- Y) .* log(1 - hₓ)) / N
    # Regularisation term, excluding the intercept θ[1].
    reg = λ/(2N) * dot(θ[2:end], θ[2:end])
    return J + reg
end;
The gradient, $\gradient_θ\cost$, of the regularised cost function is
$$ \gradient_θ\cost = \begin{bmatrix} \frac{1}{N} \sum_{n=1}^N (h_θ(x_n) - y_n) x_{n0} \\ \frac{1}{N} \sum_{n=1}^N (h_θ(x_n) - y_n) x_{n1} + \frac{λ}{N}θ_1 \\ \frac{1}{N} \sum_{n=1}^N (h_θ(x_n) - y_n) x_{n2} + \frac{λ}{N}θ_2 \\ \vdots \\ \frac{1}{N} \sum_{n=1}^N (h_θ(x_n) - y_n) x_{n,P-1} + \frac{λ}{N}θ_{P-1} \\ \end{bmatrix}. $$
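In vectorised form, writing $y$ for the vector of labels and $h_θ(X)$ for the vector of fitted probabilities, this is $$ \gradient_θ\cost = \frac{1}{N} X^{\text{T}} \big( h_θ(X) - y \big) + \frac{λ}{N} \tilde{θ}, \qquad \tilde{θ} = (0, θ_1, θ_2, \dotsc, θ_{P-1})^{\text{T}}, $$ which is the form computed by the implementation below.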
In [9]:
function gradient(X::AbstractMatrix, Y::AbstractVector, θ::AbstractVector, λ::Number)
    N = length(Y)
    hₓ = h(X, θ)
    # Xᵀ(hₓ - Y) / N, accumulated column by column.
    ∇ = vec(sum((hₓ .- Y) .* X, 1)) ./ N
    # Regularisation term; the intercept θ[1] is not regularised.
    reg = λ .* θ ./ N
    reg[1] = 0
    return ∇ .+ reg
end;
The Hessian, $\Hessian$, of the regularised cost function is
$$ \Hessian = \frac{1}{N} \sum_{n=1}^N \big[ h_θ(x_n) (1 - h_θ(x_n)) x_n x_n^{\text{T}} \big] + \frac{λ}{N} \begin{bmatrix} 0 & & & \\ & 1 & & \\ & & \ddots & \\ & & & 1 \end{bmatrix}. $$
In [16]:
function Hessian(X::AbstractMatrix, Y::AbstractVector, θ::AbstractVector, λ::Number)
    N = length(Y)
    P = length(θ)
    hₓ = h(X, θ)
    # Per-example weights h(xₙ)(1 - h(xₙ)).
    σ² = hₓ .* (1 - hₓ)
    # Accumulate the weighted outer products xₙ xₙᵀ.
    H = zeros(Float64, (P, P))
    for i in 1:N
        H .+= σ²[i] .* X[i, :] * X[i, :]'
    end
    H ./= N
    # Regularisation term; the intercept is not regularised.
    reg = λ/N .* eye(P)
    reg[1, 1] = 0
    return H .+ reg
end;
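For reference, the sum over examples can also be written as a matrix product, $\Hessian = \frac{1}{N} X^{\text{T}} \operatorname{diag}(σ^2) X$ plus the regularisation term. The sketch below is an addition here, not part of the original notebook, and the name Hessian_vectorised is arbitrary:
function Hessian_vectorised(X::AbstractMatrix, Y::AbstractVector, θ::AbstractVector, λ::Number)
    N = length(Y)
    P = length(θ)
    hₓ = h(X, θ)
    σ² = hₓ .* (1 - hₓ)
    H = (X' * (σ² .* X)) ./ N      # Xᵀ diag(σ²) X / N
    reg = λ/N .* eye(P)
    reg[1, 1] = 0                  # the intercept is not regularised
    return H .+ reg
end;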
The update rule for Newton's method is $θ′ = θ - \Hessian^{-1} \gradient_θ \cost$.
In [17]:
# One Newton-Raphson update of the parameters, returning the new θ and its cost.
function NewtonsStep(X::AbstractMatrix, Y::AbstractVector, θ::AbstractVector, λ::Number)
    ∇J = gradient(X, Y, θ, λ)
    H = Hessian(X, Y, θ, λ)
    θ -= H \ ∇J
    J = cost(X, Y, θ, λ)
    return (θ, J)
end

# Iterate Newton's method until the change in cost falls below TOLERANCE.
function Newtons(X::AbstractMatrix, Y::AbstractVector, θ::AbstractVector, λ::Number)
    MAX_ITER = 1000
    TOLERANCE = 1e-9
    J = Array{Float64}(MAX_ITER)
    J[1] = cost(X, Y, θ, λ)
    for i in 2:MAX_ITER
        (θ, J[i]) = NewtonsStep(X, Y, θ, λ)
        if abs(J[i] - J[i-1]) < TOLERANCE
            println("Converged after $i iterations")
            return (θ, J[1:i])
        end
    end
    println("Failed to converge after $MAX_ITER iterations")
    return (θ, J)
end;
Now we can use Newton's method to solve the classification problem for any required value of the regularisation parameter, λ. The parameters θ are initialised with small random values drawn from a Normal(0, 0.001) distribution.
In [18]:
λ = 0.001
(θ, J) = Newtons(X, Y, rand(Normal(0, 0.001), (P,)), λ)
@show norm(θ);
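With the fitted parameters we can also check the accuracy on the training set, classifying each point as $y = 1$ whenever $h_θ(x) \geq 0.5$. This quick check is an addition here rather than part of the original exercise:
predictions = h(X, θ) .>= 0.5
training_accuracy = mean(predictions .== Y)
@show training_accuracy;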
A plot of the value of the cost function at each iteration demonstrates the speed of convergence.
In [19]:
plot(
    J,
    xlabel = "Iterations", ylabel = "Cost", legend = false,
    size = (500, 350) )
Out[19]:
Given a solution to the classification problem, we can plot the decision boundary over the data. The plot here shows not the decision boundary itself but the contours of $\prob(y = 1 \cond x; θ)$, the class probabilities. The decision boundary is at $\prob(y = 1 \cond x; θ) = 0.5$, which corresponds to $θ^{\text{T}}x = 0$.
In [20]:
contour_plot(U, V, Y, θ, λ)
Out[20]:
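A natural follow-up is to refit with larger values of the regularisation parameter and compare the resulting probability contours. The loop below is a sketch of that comparison (the values 1 and 10 are illustrative choices, and the resulting plots are not reproduced here):
for λ′ in (1.0, 10.0)
    (θ′, _) = Newtons(X, Y, rand(Normal(0, 0.001), (P,)), λ′)
    display(contour_plot(U, V, Y, θ′, λ′))
end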